# -*- coding: utf-8 -*-
"""
This script generates the seeds for the water LDA. Run it only once: if the
saved seeds are replaced, you will not be able to replicate your results.

@author: samir
"""
import pandas as pd
import topicmodels
import topicmodels.preprocess
import os
import numpy as np
from paths import target_path, water_LDA_path #importing target_path from paths.py


def main():
    """Print a banner to stdout announcing that this script is running."""
    banner = "Running: Water LDA Seeds"
    print(banner)

if __name__ == "__main__":
    main()

# NOTE: the pipeline below is module-level code, so it runs whenever this file
# is executed or imported; only the banner printed by main() is guarded above.

# Location of the saved seed, relative to water_LDA_path.
SEED_FILE = '7500_tfidf_10_Topics/seed.csv'

# Change to relevant directory
if os.path.exists(target_path):
    os.chdir(target_path)
else:
    # Only a warning is printed; the script keeps going from the current
    # working directory, so the relative data path below is resolved there.
    print(f"Warning: {target_path} does not exist!")

# Import data; NaN cells are replaced with empty strings before the
# lemmatized complaint column is handed to topicmodels.
dataframe = pd.read_csv("Water_Subset/Processed_Data/Water_Complaints_lemmatized.csv", encoding="utf-8")
dataframe = dataframe.replace(np.nan, '', regex=True)
docsobj = topicmodels.RawDocs(dataframe.IncidentDescriptionLemma, "long")

# Preprocess lemmatized complaints
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")

# Change directory to save LDA outputs in the correct folder. The chdir is
# kept before term_rank, as in the original ordering, in case term_rank
# writes files to the working directory (TODO confirm against topicmodels).
os.chdir(water_LDA_path)
docsobj.term_rank("tokens")

# tf-idf removal: the cutoff passed to rank_remove is element [1] of entry
# 7500 of the tf-idf ranking (presumably that term's tf-idf score -- confirm
# against topicmodels).
docsobj.rank_remove("tfidf", "tokens", docsobj.tfidf_ranking[7500][1])

# Generate seed needed for LDA
# Seed 1
if os.path.exists(SEED_FILE):
    # A saved seed must never be replaced: results produced with it could no
    # longer be replicated. Delete the file by hand to regenerate it.
    print(f"Warning: {SEED_FILE} already exists; keeping it and not generating a new seed.")
else:
    # np.savetxt does not create missing folders, so make sure it is there.
    os.makedirs(os.path.dirname(SEED_FILE), exist_ok=True)
    # Second argument is presumably the number of topics (the output folder
    # is named "10_Topics") -- confirm against topicmodels.
    ldaobj = topicmodels.LDA.LDAGibbs(docsobj.tokens, 10)
    seed = ldaobj.topic_seed
    np.savetxt(SEED_FILE, seed)